Main figures
Figure 1
Figure 1 contains a map (1A) and the calibrated radiocarbon dates (1B) for 22 (out of 24) individuals sequenced in the project, as well as two individuals (Polynesian-“Botocudos”) published in 2014 (Malaspinas et al.).
Fig. 1A Map
The part of the map was made in two parts. The main part corresponds to the map of Brazil, with the second part being a small inlet of the Americas. These two maps were imported to Illustrator, and the labels, colors, and legend were adjusted there.
# png("~/Projects/Botocudos/Plots/map_DBC.png",
# width = 20, height = 15, res = 300, units = "in")
# pdf("~/Projects/Botocudos/Plots/map_DBC.pdf", width = 20, height = 15)
# layout(my_mat, widths = c(1,1), heights = c(2,2))
myColors <- c(sea = "#b8b8ff", land = "white", border = "gray90")
# pdf("~/Projects/Botocudos/Plots/Map/pre_Fig1_Americas.pdf",
# width = 10, height = 10)
map(database = "world", interior = T, boundary = F,
xlim = c(-180, -20),
ylim = c(-90, 90),
bg = as.character(myColors["sea"]),
fill = T,
col = as.character(myColors["land"]),
border = as.character(myColors["border"])
)
coords_brazil <- c(x1 = -80, x2 = -30,
y1 = -40, y2 = 10)
rect(xleft = coords_brazil["x1"],
xright = coords_brazil["x2"],
ybottom = coords_brazil["y1"],
ytop = coords_brazil["y2"],
border = "white",
col = "NA")
# dev.off()
# pdf("~/Projects/Botocudos/Plots/Map/pre_Fig1_Brazil.pdf",
# width = 10, height = 10)
coordinates_pops <- read.table(str_glue("{prefix_data}/final_inputs/Fig_1/map_brazil_pops.coordinates"))
colnames(coordinates_pops) <- c("population", "y", "x", "type")
map(database = "world", interior = T, boundary = T,
xlim = coords_brazil[c("x1", "x2")],
ylim = coords_brazil[c("y1", "y2")],
mar = c(0,0,0,0),
col = as.character(myColors["land"]),
bg = as.character(myColors["sea"]),
fill = T,
border = "gray50"
)
points(
x = coordinates_pops$x,
y = coordinates_pops$y,
pch = 1,
cex = 2
)
text(
x = coordinates_pops$x + 1 ,
y = coordinates_pops$y + 1 ,
labels = coordinates_pops$population
)
# dev.off()
Fig. 1B Calibrated dates
To reproduce the calibration curves, you need to use OxCal , then 1 - open a new project - import new plot - copy paste “plot” code from input_oxcal_sorted.txt - adjust settings as wished - save
# content of final_inputs/Fig_1/input_oxcal_sorted.txt
Plot()
{
Curve("ShCal13","ShCal13.14c");
R_Date("MN00016", 327, 24);
R_Date("MN00045", 249, 29);
R_Date("MN00064", 199, 41);
R_Date("MN0009", 185, 29);
R_Date("MN00067", 170, 21);
R_Date("MN00056", 169, 25);
R_Date("MN0003", 160, 26);
R_Date("MN00013", 150, 40);
R_Date("MN00022", 145, 30);
R_Date("MN0008", 149, 23);
R_Date("MN00023", 137, 25);
R_Date("MN00068", 121, 33);
R_Date("MN00039", 106, 30);
R_Date("MN00010", 84, 22);
R_Date("MN00069", 79, 39);
R_Date("MN00346", 76, 30);
R_Date("MN00021", 66, 50);
R_Date("MN00118", 48, 33);
R_Date("MN00316", 40, 30);
R_Date("MN00119", 23, 22);
R_Date("MN1943", 1080, 25);
R_Date("Bot15", 417, 25);
Curve("Marine13","Marine13.14c");
Delta_R("LocalMarine",33,24);
Mix_Curve("Mixed","ShCal13","LocalMarine",50,0);
R_Date("MN01701", 2035, 27);
R_Date("Bot17", 487, 25);
};
Figure 2 MDS + fastNGSadmix
mds_plot <- function(points, dim1, dim2, index, pop_color, myLegend, xlab, ylab,
xlim, text_coords,
do_plot = T, do_legend = T, main = "MDS with worldwide groups",
cex = 4,
eig = NULL, cex_text = 1){
# to use alpha()
if(do_plot){
# modify x and y axes
xlab <- paste0(
"Dimension ", data_to_plot$dim1, ": ",
percent(eig[dim1]/sum(abs(eig)), accuracy = 0.01)
)
ylab <- paste0(
"Dimension ", data_to_plot$dim2, ": ",
percent(eig[dim2]/sum(abs(eig)), accuracy = 0.01)
)
# plot axes
plot(points[,dim1], points[,dim2],
xlab = xlab, ylab = ylab,
bty = "n", main = main,
type = "n",
xlim = xlim,
cex.axis = cex_text,
cex.lab = cex_text*1.1)
title()
grid()
# add other points
points(points[ -index , dim1 ], points[ -index, dim2 ],
col = as.character(points$color[ -index ]),
bg = alpha(as.character(points$bg[ -index ]), 0.9),
pch = points$pch[ -index ],
cex = cex)
# points of interest
points(points[index, dim1], points[index, dim2],
col = as.character(points$color[index]),
bg = alpha(as.character(points$bg[index]), 0.8),
pch = points$pch[index],
cex = cex)
}
# Add text
text("Remote Oceania",
x = 0.15, y = -0.1,
col = pop_color["RemoteOceania"], font = 1, adj = 0)
text("Americas",
x = 0.09, y = 0.1,
col = pop_color["Americas"], font = 1, adj = 0)
text("New 'Botocudo' individuals", x = 0.15, y = 0,
col = pop_color["Botocudos"], font = 1, adj = 0)
text("Africa", x = -0.25, y = -0.05,
col = pop_color["Africa"], font = 1, adj = 0)
text("Europe and West Asia", x = 0.05, y = 0.17,
col = pop_color["Europe"], font = 1, adj = 0)
text("East and Southeast Asia", x = 0.15, y = -0.05,
col = pop_color["EastAsia"], font = 1, adj = 0)
text("Central and South Asia", x = -0.1, y = 0.05,
col = pop_color["CentralSouthAsia"], font = 1, adj = 0)
text("Near Oceania", x = -0.05, y = -0.08,
col = pop_color["NearOceania"], font = 1, adj = 0)
text("'Botocudo' individuals (Malaspinas et al., 2014)", x = -0.15, y = -0.11,
col = "black", font = 1, adj = 0)
# Add legend
if(do_legend){
plot(1:10, 1:10, axes = F, ylab = NA, xlab = NA, type = "n" )
legend(1,10, legend = myLegend$legend,
cex = 0.9,
pt.cex = 1.5,
col = as.character(myLegend$color),
pt.bg = as.character(myLegend$bg),
pch = myLegend$pch, bty = "n",
ncol = 1)
}
}
pdf_path <- "~/Projects/Botocudos/Plots/dummy_figures/MDS_fastNGSadmix_Wollstein_rmTrans.pdf"
png_path <- "~/Projects/Botocudos/Plots/ngsadmix/ISBA9/MDS_fastNGSadmix_Wollstein_rmTrans.png"
mds_save_path <- "~/Projects/Botocudos/Files/backup/final_inputs/Fig_2/MDS_rmTrans_data_to_plot.Rda"
ngsadmix_save_path <- "~/Projects/Botocudos/Files/backup/final_inputs/Fig_2/NGSadmix_rmTrans_data_to_plot.Rda"
load(mds_save_path)
load(ngsadmix_save_path)
#------------------------------------------------------------------------------#
# MDS and fastNGSadmix results
#pdf(pdf_path, width = 7, height = 3.5, useDingbats = F)
#png(png_path, width = round(35 / 2.54), height = round(20 / 2.54), units = "in", res = 300)
# Matrix to indicate the position of the eight plots
# (one MDS, and the admixture plot is done in several parts)
mat <- matrix(
c(
1,2,
1,3,
1,4,
1,5,
1,6,
1,7
),
byrow = T, nrow = 6)
#print(mat)
layout(mat, widths = c(5,3), heights = c(0.8,0.3,0.3,.3*6,.4,5))
#-----------------------------------------------------------
# MDS
par(mar = c(bottom=4, left=4, top=2, right=0))
do.call( mds_plot, data_to_plot )
# empty plot
par(mar = c(0,0,0,0))
plot(1,1,type="n", axes =F)
#-----------------------------------------------------------
# barplot/admixture
mar_bars = c(bottom=.5, left=0, top=.1,right=8)
# This plot is done in several sections to help with readabilitymanage the space
# One individual (Sambaqui, this study)
object_to_plot$nInds_per_block <- 1
object_to_plot$pop_breaks <- c("Sambaqui", "Sambaqui")
object_to_plot$plot(do_layout = F, add_text = F, horiz = T, mar_bars = mar_bars, srt = 0, adj = 0)
# One individual (Mummified individual from Minas Gerais, this study)
object_to_plot$nInds_per_block <- 1
object_to_plot$pop_breaks <- c("MinasGerais", "MinasGerais")
object_to_plot$plot(do_layout = F, add_text = F, horiz = T, mar_bars = mar_bars, srt = 0, adj = 0)
# Block with 24 individuals labeled as "Botocudos"
object_to_plot$pop_breaks <- c("Botocudos", "Botocudos")
object_to_plot$nInds_per_block <- 24
object_to_plot$plot(do_layout = F, add_text = F, horiz = T, mar_bars = mar_bars, srt = 0, adj = 0)
# The two individuals from Malaspinas et al., 2014
object_to_plot$nInds_per_block <- 2
object_to_plot$pop_breaks <- c("Botocudos2014", "Botocudos2014")
object_to_plot$plot(do_layout = F, add_text = F, horiz = T, mar_bars = mar_bars, srt = 0, adj = 0)
object_to_plot$qopt <- object_to_plot$qopt[
-which(object_to_plot$new_panel$population == "Botocudos2014"),
]
object_to_plot$new_panel <-
object_to_plot$new_panel[
-which(object_to_plot$new_panel$population == "Botocudos2014"),
]
object_to_plot$myAes <- object_to_plot$myAes[
-which(object_to_plot$myAes$population == "Botocudos2014"),
]
object_to_plot$pop_breaks <- c("YRI", "Totonac")
object_to_plot$plot(do_layout = F, add_text = F, horiz = T, mar_bars = mar_bars, srt = 0, adj = 0)
In this part of the code, there were two R objects that were
important to load, one stored in
MDS_rmTrans_data_to_plot.Rda and another one in
NGSadmix_rmTrans_data_to_plot.Rda.
After loading MDS_rmTrans_data_to_plot.Rda, an object
called data_to_plot will appear in the environment.
data_to_plot is a list, and one of the items in the list
is a dataframe called points. The rows in
points correspond to the individuals, and the first 606
columns correspond to each dimension of the MDS. The last columns have
the metadata of the individuals, which is useful to make queries and to
make the plot.
dim(data_to_plot$points)
## [1] 607 617
head(data_to_plot$points[,608:617])
## indID famID population label region region.1 color bg pch
## 72 F045400 F045400 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## 73 F045403 F045403 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## 74 F045415 F045415 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## 75 F045434 F045434 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## 76 F045438 F045438 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## 77 F045452 F045452 Bambaran Bambaran Africa Africa #5c2c45 #5c2c45 21
## legend
## 72 Africa
## 73 Africa
## 74 Africa
## 75 Africa
## 76 Africa
## 77 Africa
Once NGSadmix_rmTrans_data_to_plot.Rda is loaded, an
object called object_to_plot will be found in the
environment. This is an object I defined to help me plotting and
modifying admixture plots as needed. However, it might not be easy to
use. If anyone needs to play with this object, I recommend using its
plot() function right after loading it, as this will
initialize many useful variables. It will also print a default admixture
plot with all the data.
The admixture proportions are stored in its qopt
dataframe. This dataframe has an extra column (width) which
helps to indicate the desired width for the genome of an individual in a
plot.
The metadata (individuals’ IDs, populations, etc.) is stored in the
new_panel dataframe. The color of each component can be
adjusted by modifying the vector colors.
load(ngsadmix_save_path)
# object_to_plot$plot()
head(object_to_plot$qopt)
## k1 k2 k3 k4 k5 k6 width
## 1 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
## 2 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
## 3 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
## 4 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
## 5 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
## 6 1 1e-09 1e-09 1e-09 1e-09 1e-09 1
head(object_to_plot$new_panel)
## indID famID population label region
## 27 NA18508 NA18508 YRI Yoruba Africa
## 28 NA18858 NA18858 YRI Yoruba Africa
## 29 NA18859 NA18859 YRI Yoruba Africa
## 30 NA18517 NA18517 YRI Yoruba Africa
## 31 NA18516 NA18516 YRI Yoruba Africa
## 32 NA18523 NA18523 YRI Yoruba Africa
object_to_plot$colors
## [1] "#5c2c45" "#ffb852" "#e81900" "#b3d9a3" "#29bdad" "#b319ab" "black"
Figure 3: F3
This is a large figure, containing an F3 plot, and NGSadmix plot, and two qpGraph plots.
Fig. 3A F3
boto_f3 <- read.csv( str_glue("{prefix_data}/final_inputs/Fig_3/f3_metadata_plot.csv"))
boto_f3$Source_2 <- factor(boto_f3$Source_2, levels = boto_f3$Source_2, ordered = T)
boto_f3$disp_blocks <- cut_number(boto_f3$f_3, n = 5)
boto_f3$disp_blocks <- factor(
boto_f3$disp_blocks,
levels = rev(levels(boto_f3$disp_blocks)),
ordered = T
)
#pdf("~/Projects/Botocudos/Plots/F3/2021_06/ggplot_F3.pdf", width = 8.6, height = 3.2)
ggplot(boto_f3, aes(x = f_3, y = Source_2, color = Source_2,
xmin = f_3 - 3*std.err,
xmax = f_3 + 3*std.err)) +
geom_point(size = 0.8) +
geom_errorbar(size = 0.34, width = 0) +
scale_color_manual(values = boto_f3$color) +
facet_wrap(~disp_blocks, nrow = 1, scales = "free_y", as.table = T) +
theme_light() +
theme(legend.position = "none",
axis.text = element_text(size = 5),
strip.text.x = element_blank()) +
labs(x = "F3", y = NULL, title = NULL)
The dataframe boto_f3 stores the output of admixtools. A
few extra columns were added, such as the geographic coordinates for the
plot, whether the population is ancient, and the color of the dots.
head(boto_f3)
## X Source_1 Source_2 Target f_3 std.err Z SNPs
## 1 2223 Botocudos + Saqqaq YRI 0.207893 0.004145 50.150 21221
## 2 2322 Botocudos + USR1 YRI 0.223061 0.003208 69.528 53450
## 3 2268 Botocudos East_Greenlanders YRI 0.229675 0.002602 88.270 55594
## 4 2229 Botocudos Alaskan_Inuit YRI 0.230862 0.003054 75.585 51775
## 5 2239 Botocudos Aleutians YRI 0.237591 0.004623 51.396 20242
## 6 2250 Botocudos West_Greenlanders YRI 0.238307 0.002935 81.198 49700
## population lat long type color disp_blocks
## 1 Saqqaq 67.0 -49.0 Ancient #00F7FF [0.208,0.259]
## 2 USR1 61.0 -152.0 Ancient #FF8DFF [0.208,0.259]
## 3 East_Greenlanders 67.5 -37.9 PresentDay #FF83CF [0.208,0.259]
## 4 Alaskan_Inuit 58.3 -134.4 PresentDay #FF85BC [0.208,0.259]
## 5 Aleutians 52.0 -176.6 PresentDay #FF9824 [0.208,0.259]
## 6 West_Greenlanders 65.3 -52.0 PresentDay #FF9A00 [0.208,0.259]
Fig. 3B NGSAdmix (Americas’ panels)
load(str_glue("{prefix_data}/final_inputs/Fig_3/object_to_plot_ngsadmix.Rda"))
object_to_plot$plot(do_layout = T, add_text = T)
#str(object_to_plot)
As noted for Figure 2, we are loading an object called
object_to_plot. The dataframe qopt contains
the estimated admixture proportions, and the data for each individual
(ID, population, etc.) can be found in the new_panel
dataframe.
With this object, you can also check which population has the highest average admixture proportions for each of the components.
# load(str_glue("{prefix_data}/final_inputs/Fig_3/object_to_plot_ngsadmix.Rda"))
# object_to_plot$plot()
head(object_to_plot$qopt)
## k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13
## 1 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## 2 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## 3 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## 4 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## 5 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## 6 1 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09 1e-09
## width
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
head(object_to_plot$new_panel)
## indID famID population label region
## 132 NA18486 NA18486 YRI Yoruba Africa
## 133 NA18517 NA18517 YRI Yoruba Africa
## 134 NA18861 NA18861 YRI Yoruba Africa
## 135 NA19107 NA19107 YRI Yoruba Africa
## 136 NA19121 NA19121 YRI Yoruba Africa
## 137 NA19149 NA19149 YRI Yoruba Africa
head(object_to_plot$colors)
## [1] "gray0" "#91AA55" "#A9627E" "gray30" "#294A62" "#BE8C80"
# populations with the highest average component
print(object_to_plot$max_anc_pop)
## k population ancestry
## 8 1 YRI 1.0000000
## 7 2 Papuans 1.0000000
## 1 3 Dai 0.9984098
## 2 4 French 0.9996927
## 9 5 Chipewyan 0.9154164
## 4 6 Pima 0.9712845
## 11 7 Mixe 0.9653594
## 13 8 Aymara 0.9640357
## 6 9 Cabecar 1.0000000
## 5 10 Karitiana 0.9954465
## 10 11 Surui 0.9681987
## 3 12 Guarani_KW 0.9735275
## 12 13 Xavante 0.9816969
Fig. 3C and 3D qpGraph
Figure 4 Conditional heterozygosity, PSMC, ROH
#------------------
# Conditional heterozygosity
pop_color <- c(
"#5c2c45",
"#ffb852",
"#ffa6d9",
"#e81900",
"#65A98F",
"#29bdad",
"#b319ab",
"#9C52F2",
"black"
)
names(pop_color) <- c("Africa",
"Europe",
"CentralSouthAsia",
"EastAsia",
"NearOceania",
"RemoteOceania",
"Americas",
"Ancient_Americas",
"Botocudos")
main_size <- 12
legend_size <- 8
selected_pi <- read.csv(
str_glue("{prefix_data}/final_inputs/Fig_4/conditional_het_plot.csv"),
stringsAsFactors = F
)
selected_pi$Region <- factor(
selected_pi$Region,
levels = c(
"Africa", "WestEurasia", "EastAsia", "Oceania", "Americas_Ancient",
"Americas", "Botocudos"
),
ordered = T
)
selected_pi <- selected_pi[order(selected_pi$Region, selected_pi$estimate), ]
selected_pi$Population.ID <- factor(selected_pi$Population.ID, levels = rev(selected_pi$Population.ID), ordered = T)
condHet_plot <- ggplot(selected_pi, aes(x = Population.ID, y = estimate, ymin = confint1,
ymax = confint2, color = Region, interaction = Region)) +
geom_point(size = 1.2) +
geom_errorbar(lwd = .6, width = 0.5) +
theme_minimal() +
guides(color=guide_legend(nrow=2)) +
theme(axis.text.x = element_text(angle = 90, size = 9, hjust = 1, vjust = 0.5),
legend.position = "none",
legend.text = element_text(size = legend_size),
title = element_text(size = main_size),
strip.text = element_text(size=7, margin = margin(.1, 0, .1, 0, "cm"))
) +
labs(title = "Conditional heterozygosity",
x = NULL, y = "Estimate") +
scale_color_manual(
values = as.character(
pop_color[rev(c("Botocudos", "Ancient_Americas",
"Americas",
"NearOceania", "EastAsia", "Europe",
"Africa"))])
) +
coord_trans(y = "log") +
facet_grid(.~Region, scales = "free", space = "free")
#-----------------
# ROH
pop_color <- c(
"#5c2c45",
"#ffb852",
"#e81900",
"#65A98F",
"#b319ab",
"#9C52F2",
"black"
)
names(pop_color) <- c("Africa",
"Europe",
"EastAsia",
"Oceania",
"Americas",
"Americas_Ancient",
"Botocudos")
lengths_all <- read.csv(str_glue("{prefix_data}/final_inputs/Fig_4/roh_plot_main_fig.csv"))
lengths_all <- lengths_all[lengths_all$length>0,]
lengths_all$sample <- factor(lengths_all$sample, levels = unique(lengths_all$sample), ordered = T)
region_levels <- unique(lengths_all$region)
lengths_all$region <- factor(lengths_all$region, levels = region_levels, ordered = T)
breaks_mb <- c(0, 0.5, seq(1, 32))
breaks_labels <- c("", 0.5, seq(5, 32, 5))
xlim <- c(0, 65)
ylim <- c(0, 850)
main <- "ROH distribution"
xlab <- "ROH length category (Mb)"
ylab <- "Sum (Mb)"
x <- "category" ; y <- "length"
roh_plot <- ggplot(lengths_all, aes( x = as.factor(lengths_all[, x]), y = lengths_all[, y] ,
fill = region)) +
geom_bar(stat = "identity", position = position_dodge(), color = "white") +
labs(x = xlab, y = ylab, title = main) +
coord_cartesian(xlim = c(0, sum(breaks_mb <= xlim[2])),
ylim = c(1, 1000)) +
scale_x_discrete(breaks = breaks_mb[ breaks_mb <= xlim[2]],
labels = sapply(breaks_mb, function(x) ifelse(x %in% breaks_labels, x, "")),) +
scale_y_log10() +
facet_wrap(sample~., ncol = 2
) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(size = 8, angle = 90, hjust = 1, vjust = 0)
) +
scale_fill_manual(values = (pop_color[region_levels]))
#roh_plot
#-----------------
# PSMC
psmc <- read.csv(str_glue("{prefix_data}/final_inputs/Fig_4/psmc_plot.csv"))
psmc_plot <- ggplot(psmc, aes(x = time+1, y = popsize*10e4, color = sample)) +
geom_step(stat="identity", size = 1.) +
coord_cartesian(xlim = c(9000, 10e5)) +
scale_x_log10(labels = scales::comma,
minor_breaks = as.numeric(sapply(seq(3, 7), function(i) seq(1,10)*10**i))) +
scale_y_log10(labels = scales::comma,
minor_breaks = as.numeric(sapply(seq(3, 6), function(i) seq(1,10)*10**i))) +
theme_minimal() +
scale_color_manual(
name = "Genome",
values = alpha(c("black","#FF616B", "#A7374B", "#b319ab","#99B333", "#29BDAD"), 0.8),
breaks = c( "MN0008_L3U_trim2",
"SuruiA",
"Karitiana_HGDP_TeamAB",
"MayaG",
"A_Dai",
"A_Yoruba"),
labels = c("MN0008 (Botocudo)",
"A_Surui (Americas)",
"A_Karitiana (Americas)",
"A_Maya (Americas)",
"A_Dai (East Asia)",
"A_Yoruba (Africa)")) +
guides(color=guide_legend(nrow=2)) +
theme(legend.position = "top", #c(0.1, 0.7),
legend.text = element_text(size = legend_size),
title = element_text(size = main_size)) +
labs(title = "Effective population size",
x = "Years ago",
y = expression(N[e]))
#psmc_plot
col1_x <- 0
col2_x <- 0.6
row1_y <- 0.6
row2_y <- 0
#pdf("~/Projects/Botocudos/Plots/dummy_figures/Fig4_condHet_ROH_PSMC.pdf",
# width = 12, height = 8)
ggdraw() +
draw_plot(condHet_plot, x = col1_x, y = row1_y, width = col2_x , height = 1 - row1_y) +
draw_plot(roh_plot, col2_x, row2_y, 1 - col2_x, 1 - row2_y) +
draw_plot(psmc_plot, col1_x, row2_y, col2_x, row1_y - 0.05) +
draw_plot_label(c("A", "B", "C"), c(col1_x, col2_x, col1_x), c(1, 1, row1_y - 0.05), size = 15)
#dev.off()
head(selected_pi)
## X estimate bias stderror confint1 confint2 Population.ID ind1
## 4 252 0.243607 0 0.001376 0.240910 0.246305 San SS6004473
## 3 244 0.258621 0 0.001674 0.255340 0.261902 Mbuti HGDP00456
## 2 242 0.298109 0 0.001430 0.295306 0.300912 Mandenka SS6004470
## 1 271 0.480327 0 0.000888 0.478587 0.482067 Yoruba SS6004475
## 6 253 0.214769 0 0.001711 0.211416 0.218122 Sardinian SS6004474
## 5 226 0.215835 0 0.001530 0.212836 0.218833 French SS6004468
## ind2 Region
## 4 HGDP01029 Africa
## 3 SS6004471 Africa
## 2 HGDP01284 Africa
## 1 HGDP00927 Africa
## 6 HGDP00665 WestEurasia
## 5 HGDP00521 WestEurasia
head(lengths_all)
## X.1 X sample category length indID population region legend
## 1 2 2 A_Yoruba 0.5 751.23589 A_Yoruba Yoruba Africa A_Yoruba
## 2 3 3 A_Yoruba 1.0 422.29297 A_Yoruba Yoruba Africa A_Yoruba
## 3 4 4 A_Yoruba 2.0 38.94673 A_Yoruba Yoruba Africa A_Yoruba
## 4 5 5 A_Yoruba 3.0 34.47197 A_Yoruba Yoruba Africa A_Yoruba
## 5 8 8 A_Yoruba 6.0 6.21418 A_Yoruba Yoruba Africa A_Yoruba
## 6 16 16 A_Yoruba 14.0 14.36732 A_Yoruba Yoruba Africa A_Yoruba
head(psmc)
## X time popsize no big idea sample
## 1 1 0.000 0.06421105 4309.061 0.038302 0.109645 MN0008_L3U_trim2
## 2 2 1258.891 0.06421105 4519.083 0.040169 0.040681 MN0008_L3U_trim2
## 3 3 2634.266 0.06421105 4721.392 0.041967 0.029057 MN0008_L3U_trim2
## 4 4 4136.997 0.06421105 4912.350 0.043665 0.033278 MN0008_L3U_trim2
## 5 5 5779.047 0.08383654 3922.245 0.034864 0.028557 MN0008_L3U_trim2
## 6 6 7573.055 0.08383654 4098.051 0.036427 0.030734 MN0008_L3U_trim2
Figure 5 Demography
This figure was modified on Adobe Illustrator.
plot_name <- "twelve_models"
#png(str_glue(png_path), width = 28, height = 18, res =300, units = "in")
par(mar = c(8, 6, 2, 6))
pop_names_colors <- c("yellow", "gray", "gray", "gray", "gray", "blue", "blue", "red")
names(pop_names_colors) <- c(
"Yoruba", "ANC", "Out of Africa", "Americas",
"South America", "Karitiana", "Surui", "Botocudos"
)
load(
"~/Projects/Botocudos/Files/backup/final_inputs/Fig_5/model_values.Rda"
)
make_demography_plot(
pop_names_colors = pop_names_colors, do_log = T, max_width = 0.8,
pop_new_order = c(
"Yoruba", "ANC", "Out of Africa", "Americas",
"South America", "Karitiana", "Surui", "Botocudos"
),
pop_old_order = c("Botocudos", "Karitiana", "Surui", "Americas",
"Out of Africa", "ANC", "Yoruba", "South America"),
bottleneck_height = 0.01,
alpha = 1, main = "",
... = model_values
)
#dev.off()
Figure 6 Dstats
abba <- read.table(
str_glue("{prefix_data}/final_inputs/Fig_6/July_09_trim5.ErrorCorr.TransRem.txt"),
header = T, stringsAsFactors = F
)
pops_in_title <- c(
"Mixe",
"Surui",
"Karitiana",
"MN0008",
"MN0008_trim5",
"MN0008_L3U",
"MN0008_trim5",
"MN0008_L3U_trim2",
"MN0008_L3U_trim5",
"22Botocudos_trim5",
"LagoaSanta",
"LagoaSanta_trim5",
"Huichol",
"Aymara"
)
pops_in_y <- c(
"Mixe",
"Huichol",
"Aymara",
"Surui",
"Karitiana",
"MN0008_L3U_trim2",
"22Botocudos_trim5",
"LagoaSanta",
"LagoaSanta_trim5"
)
pops_in_h3 <- c(
"French",
"Han",
"Andaman_trim5",
"Papuan",
"Australian"
)
pop_colors <- c("deepskyblue1", "darkgoldenrod1",
"pink", "brown3", "blueviolet",
"black")
pops_in_title = c("Mixe")
pops_in_h3 <- c(
"French",
"Han",
"Andaman_trim5",
"Papuan",
"Australian"
)
pops_in_h3_labels <- c("French\n(n = 1)",
"Han\n(n = 1)",
"Andaman\n(n = 1)",
"Papuan\n(n = 1)",
"Australian\n(n = 7)")
pop_colors <- c("deepskyblue1", "darkgoldenrod1",
"pink", "brown3", "blueviolet",
"black")
pop_colors <- c("#0057BA", "#96BFE6", "#CC85D1", "#DE4500", "#9E194D", "black")
pop_colors <- c("gray60", "gray80", "#CC85D1", "#96BFE6", "#0057BA", "black")
pops_in_y <- c(
"Surui",
"LagoaSanta",
"LagoaSanta_trim5",
"LagoaSanta",
"MN0008_L3U_trim2",
"Karitiana",
"Aymara",
"Huichol"
)
new_labels = list(
MN0008_L3U_trim2 = "MN0008\n(n = 1)",
Surui = "Surui\n(n = 2)",
LagoaSanta_trim5 = "LagoaSanta\n(n = 1)",
Karitiana = "Karitiana\n(n = 4)",
Aymara = "Aymara\n(n = 1)",
Huichol = "Huichol\n(n = 1)"
)
pop <- "Mixe"
outgroup <- "Yoruba"
Dstat <- abba[(abba$H1 %in% pops_in_title |abba$H2 %in% pops_in_title) &
(abba$H3 %in% pops_in_h3),]
df_y <- as.data.frame(pops_in_y)
df_y$y_axis <- rownames(df_y)
plot_list <- list()
Dstat <- abba[(abba$H1 == pop |abba$H2 == pop) &
(abba$H3 %in% pops_in_h3),]
Dstat <- switch_h2_h1(Dstat, pop)
Dstat <- Dstat[!(Dstat$H1 %in% pops_in_h3),]
Dstat <- Dstat[Dstat$H1 %in% pops_in_y,]
for(label in names(new_labels)){
Dstat[Dstat$H1 == label,]$H1 <- new_labels[[label]]
}
Dstat$H1 <- factor(Dstat$H1,
levels = rev(unique(new_labels))
)
Dstat$H3 <- factor(Dstat$H3,
levels = c(pops_in_h3),
ordered = T)
x_label <- expression(
atop(
"Z",
paste("(", H[1], ", ", H[3], ") < --- > (Mixe, ", H[3], ")"),
paste("(Mixe, Yoruba)", " < --- > (", H[1], ", Yoruba)")
)
)
p <- ggplot(Dstat, aes(x = Z, y = H1,
fill = H3)) +
coord_cartesian(xlim = c(-4,3.5)) +
scale_fill_manual(values = alpha(pop_colors, .8),
breaks = pops_in_h3,
labels = pops_in_h3_labels,
name = expression(paste(H[3], ":"))) +
geom_vline(xintercept = c(-3.3,3.3), lty = "dashed", col = "brown", lwd = 1)+
geom_vline(xintercept = c(-4,-3,-2,-1,0,1,2,3), lty = "solid", col = "gray", lwd = 0.2) +
geom_vline(xintercept = c(-3.5,-2.5,-1.5,-0.5,0.5,1.5,2.5,3.5), lty = "dashed", col = "gray", lwd = 0.2) +
geom_point(shape = 23, size = 7, stroke= 1) +
labs(
x = x_label,
y = expression(H[1]),
title = expression(bold(paste("Z-scores for D(", H[1], ", ", H[2]," = Mixe; ", H[3], ", Yoruba)" )))) +
theme_cowplot() +
theme(legend.position = "top",
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16)
)
#pdf("~/Projects/Botocudos/Plots/dummy_figures/Dstat.pdf", width = 9.5, height = 7.5)
p
#dev.off()
Figure 7 Virome
Done
prot_names <- c(
"11-kDa", "11 kDa protein",
"7.5-kDa", "7.5 kDa protein",
"hypothetical 11kDa protein", "hypothetical protein",
"ns", "NS",
"ns1", "NS1",
"vp1", "VP1",
"VP1/2", "vp2",
"VP2", "VP3",
"protein X",
"X", "X-9 kDa protein"
)
prot_colors <- c(
rep("orange", 2), # 11 kDa
rep("plum" , 2), # 7.5 kDa
rep("#172713", 2), # hypot 11 kDa
rep("#172713", 4), # NS1
rep("#ff616b", 2), # VP1
"#de4500" , # Vp1/2
rep("#328e13", 2), # VP2
"plum4", # VP3
rep("seagreen", 3) # X
)
names(prot_colors) <- prot_names
plot_bamdamage <- function(five_prime, three_prime, titlecolor = "black", cex = 1,
cexmain = 2, txt_reads = F,
y2 = 0.55, plot.title, plot.title2 = "", myColor = c(),
legend_only = F, ncol_legend = 3, legend_cex = 1){
if (length(myColor) == 0) {
myColor <- c("royalblue", "firebrick3", "lightblue3", "indianred1",
"orange","orange3", "plum", "plum4", "paleturquoise3",
"paleturquoise4", "rosybrown2", "rosybrown3")
names(myColor) <- c("C..T", "G..A", "T..C", "A..G", "C..A", "C..G",
"T..A", "T..G", "A..C", "A..T", "G..C", "G..T")
}
if (legend_only) {
myIndex <- round(seq(1, 13, length.out = ncol_legend + 1))
for (i in 1:(length(myIndex) - 1)) {
plot(1:10, 1:10, type = "n", bty = "n", axes = F, xlab = NA, ylab = NA)
if (i == 1) {
legend(1, 10, legend = sub("\\.\\.", " to ",
names(myColor)[myIndex[i]:(myIndex[i + 1] - 1)]),
col = myColor[myIndex[i]:(myIndex[i + 1] - 1)],
lty = 1, bty = "n", cex = legend_cex, lwd = 1.6,
title = "Substitution type")
}else{
legend(1, 10, legend = sub("\\.\\.", " to ", names(myColor)[myIndex[i]:(myIndex[i + 1] - 1)]),
col = myColor[myIndex[i]:(myIndex[i + 1] - 1)],
lty = 1, bty = "n", cex = legend_cex, lwd = 1.6
)
}
}
return()
}
plot_lines <- function(damage, left = T){
plot(damage$C..T[1:25], type = "n",
bty = "n",
ylab = "Frequency", xlab = NA,
cex = cex, axes = F, ylim = c(0, y2))
if (left) {
title(main = plot.title, xlab = "5'-position", cex.main = cexmain)
axis(side = 1, at = seq(0, 25, 5))
axis(side = 2)
}else{
axis(side = 1, at = seq(1, 25, 5), labels = seq(25, 1, -5))
axis(side = 4)
title(main = plot.title2, xlab = "3'-position", cex.main = cexmain,
col.main = titlecolor)
nb_reads_legend <- paste0("Number of reads: ", txt_reads)
text(x = 12, y = 0.51, labels = nb_reads_legend, cex = 1.15)
}
for (type in names(myColor)) {
lines(damage[1:25, type, with = F],
col = alpha(myColor[type], 0.8),
lwd = 1.8,
cex = cex)
}
}
par(mar = c(4, 4, 3, 1))
plot_lines(damage = five_prime)
par(mar = c(4, 1, 3, 3))
plot_lines(damage = three_prime[25:1, ], left = F)
}
blank_plot <- function(){
par(mar = c(0,0,0,0))
plot(i, i, type = "n", bty = "n", axes = F, xlab = "", ylab = "", main = "")
}
draw_coverage <- function(coverage, plot.title, myColor,
limy){
barplot(height = coverage,
main = plot.title,
ylab = "Number of reads",
col = myColor,
border = NA,
axes = T,
ylim = c(0, limy),
space = 0,
cex.main = 1.5)
# x axis
xtick <- round(seq(1, 5000, length.out = 6))
axis(side = 1, at = xtick, labels = FALSE, tck = -0.01)
text(x = xtick, par("usr")[3], labels = paste0(xtick, "bp"), pos = 1, xpd = TRUE,
cex = 1)
}
draw_protein_boxes <- function(prots_coords, boxColors, limy, boxes_only = F){
#browser()
# Rectangles part
max_coverage = limy
if(boxes_only){
limy = -max_coverage*5
barplot(height = limy,
axes = F,
col = NA,
border = NA,
ylim = c(0,max_coverage),
xlim = c(1,5017)
)
}
space_btwn_box <- -limy/20
box_width <- -limy/20
box_top <- box_width
for (i in c(1:length(prots_coords$start))) {
if (prots_coords[i, Name] %in% c("ns", "NS", "ns1", "NS1", "vp1", "VP1", "VP1/2")) {
box_top <- box_width
} else if (prots_coords[i, Name] %in% c("VP2", "vp2")) {
box_top <- box_top + space_btwn_box
} else {
box_top <- 2 * box_top + space_btwn_box
}
rect(xleft = prots_coords[i, start],
xright = prots_coords[i, end],
ytop = box_top,
ybottom = box_top + box_width,
col = boxColors[prots_coords[i, Name]],
border = NA)
box_top <- box_width
}
}
plot_cov <- function(coverage, plot.title, myColor, boxColors, prots_coords, limy = limy,
print_coverage = T, print_boxes = T, boxes_only = F){
par(xpd = T)
if(print_coverage){
draw_coverage(coverage, plot.title, myColor, limy)
}
if(print_boxes){
draw_protein_boxes(prots_coords, boxColors, limy, boxes_only = boxes_only)
}
}
for(type in c("Cov", "Len", "Dam")){
load(file = str_glue("{prefix_data}/final_inputs/Fig_7/my{type}.Rda"))
}
my_mat <- matrix(c(15,15,1,2,
15,15,1,3,
15,15,1,4,
15,15,1,5,
15,15,1,6,
11,12,7,8,
13,13,14,14,
9,10,14,14)
, 8, 4, byrow = T)
print(my_mat)
## [,1] [,2] [,3] [,4]
## [1,] 15 15 1 2
## [2,] 15 15 1 3
## [3,] 15 15 1 4
## [4,] 15 15 1 5
## [5,] 15 15 1 6
## [6,] 11 12 7 8
## [7,] 13 13 14 14
## [8,] 9 10 14 14
# pdf("~/Projects/Botocudos/Plots/Viruses/partial_fig_8.pdf", width = 5*4, height = 12)
nf <- layout(my_mat, widths = c(1.5,1.5,2,1.5), heights = c(rep(3, 5), 1.5, 9, 9))
# layout.show(nf)
#-----------------------------------------------------------------------------#
# Coverage plots
botocudos <- c("MN00346", "MN00067", "MN00021", "MN00013", "MN00119", "MN00039")
for(boto in botocudos){
if(boto != "MN00346"){
par(mar = c(1.5,4,1.5,1), cex = 1.)
}else{
par(mar = c(1.5,4,4,2), cex = 1)
}
plot_cov(coverage = myCov[[boto]]$coverage$reads, boxColors = prot_colors,
prots_coords = myCov[[boto]]$prots, plot.title = boto,
myColor = "#9e194d",
limy = max( myCov[[boto]]$coverage$reads), print_boxes = F)
}
par(mar = c(0,4,0,3))
for(i in c(1,2)){
plot_cov(coverage = "", boxColors = prot_colors,
prots_coords = myCov[[boto]]$prots, plot.title = NA,
myColor = "#9e194d",
print_coverage = F,
print_boxes = T, boxes_only = T, limy = 50)
}
#-----------------------------------------------------------------------------#
# Damage
boto <- "MN00346"
y2 <- 0.45
nreads <- scan(str_glue("{prefix_data}/final_inputs/Fig_7/MN00346.AY083234_1_final_reads.txt"))
plot.title <- boto
plot.title2 <- ""
five_prime <- myDam[[boto]][["five_prime"]]
three_prime <- myDam[[boto]][["three_prime"]]
plot_bamdamage(five_prime = five_prime, three_prime = three_prime,
cex = 1, cexmain = 1.65, titlecolor = "black",
txt_reads = nreads, y2 = y2, plot.title = plot.title,
plot.title2 = plot.title2, myColor = c())
#-----------------------------------------------------------------------------#
#
blank_plot()
blank_plot()
#-----------------------------------------------------------------------------#
# read length
par(mar = c(5,4,1,0))
barplot(myLen$MN00346$counts,
border = F,
main = "Reads mapped to HPV B19 (MN00346)",
col = "#ff616b",
xlab = "Length (bp)",
ylab = "Counts",
xlim = c(10, 78))
axis(1, at = seq(10, 60, 10)*1.2+.5, labels = seq(30, 80, 10))
plot_bamdamage(legend_only = T, ncol_legend = 1)
## NULL
# Heatmap
plot.new()
vps <- baseViewports()
pushViewport(vps$figure)
#------------------------------------------------------------------------------#
# heatmap
boc_wide <- read.csv(
str_glue("{prefix_data}/final_inputs/Fig_7/breadth_coverage_for_plot.csv"),
row.names = 1
)
samples_groups <- sapply(rownames(boc_wide), function(ind) ifelse(ind %in% c("MN1943", "MN01701"), "Pre-Contact", "Post-Contact") )
samples_groups <- factor(samples_groups, levels = c("Pre-Contact", "Post-Contact"), ordered = T)
index <- sort(unique(unlist(sapply(boc_wide, function(i) unique(round(i*100))))))
colors100 = sequential_hcl(max(index), palette = "SunsetDark")
colors = unlist(sapply(index, function(i) colors100[i]))
colors = rev(colors)
colors[1] <- "white"
ht = Heatmap(t(as.matrix(boc_wide)),
name = "Fraction of the genome covered", #title of legend
col = colors,
show_row_dend = F,
show_column_dend = F,
column_title_side = "bottom",
column_split = samples_groups,
column_gap = unit(5, "mm"),
rect_gp = gpar(col = "gray", lty = "dotted"),
border = "black",
row_names_gp = gpar(fontsize = 12), # Text size for row names
row_names_side = c("right"),
heatmap_legend_param = list(title = "Fraction of the genome covered",
direction = "horizontal"),
bottom_annotation = HeatmapAnnotation(
df = data.frame("Period" = samples_groups),
col = list("Period" = c("Pre-Contact" = "#0057ba",
"Post-Contact" = "#d60036"))
),
width = unit(10, "cm"),
height = unit(6, "cm")
)
vp1 <-plotViewport(c(1.8,1,0,1)) ## create new vp with margins, you play with this values
par(mar = c(5,5,5,5))
p <- draw(ht, column_title = "Fraction of the genome covered at least once (viruses on the top 10%)",
heatmap_legend_side = "bottom", annotation_legend_side = "bottom",
merge_legend = T, newpage = F
)
#print(p, vp = vp1)
#dev.off()
In this part, several objects were loaded (myCov,
myLen, myDam).
myCov is a list, with each element corresponding to an
individual. For each individual, you will find a dataframe indicating
the number of reads observed at each position of the parvovirus genome.
There is also a dataframe with the coordinates of the proteins.
In myLen, each item of the list corresponds to an
individual. For each individual, there is a vector corresponding to read
lengths, and another vector counting the occurrences.
The myDam object includes, per individual, a dataframe
indicating the average frequency of substitutions along the reads,
counted from the 5’ or 3’ end.
#str(myCov)
head(myCov[["MN00346"]]$coverage)
## position reads
## 1: 1 0
## 2: 2 2
## 3: 3 4
## 4: 4 6
## 5: 5 10
## 6: 6 11
head(myCov[["MN00346"]]$prots)
## start end CDS_type Name
## 1: 323 2338 product NS1
## 2: 1797 2015 product 7.5 kDa protein
## 3: 2331 4676 product VP1
## 4: 2581 2826 product X-9 kDa protein
## 5: 3012 4676 product VP2
## 6: 4597 4881 product 11 kDa protein
#str(myLen)
head(myLen$MN00346$length)
## [1] 20 21 22 23 24 25
head(myLen$MN00346$counts)
## [1] 0 0 0 0 0 0
#str(myDam)
head(myDam$MN00346$five_prime)
## C..A G..A T..A A..C G..C T..C A..G C..G T..G
## 1: 0 0.01190476 0.00000000 0 0 0.008695652 0.009009009 0 0
## 2: 0 0.00000000 0.01010101 0 0 0.010101010 0.000000000 0 0
## 3: 0 0.00000000 0.00000000 0 0 0.018518519 0.000000000 0 0
## 4: 0 0.01000000 0.00000000 0 0 0.000000000 0.000000000 0 0
## 5: 0 0.01123596 0.00000000 0 0 0.000000000 0.008264463 0 0
## 6: 0 0.01020408 0.01052632 0 0 0.000000000 0.000000000 0 0
## A..T C..T G..T
## 1: 0.027027027 0.17582418 0
## 2: 0.018181818 0.14444444 0
## 3: 0.008130081 0.16666667 0
## 4: 0.000000000 0.20652174 0
## 5: 0.008264463 0.03571429 0
## 6: 0.026315789 0.10526316 0